My objective is to do my best to predict the prices of houses from the Kaggle dataset with the time available using linear regression.

There’s a lot of categorical data, so I’m going to figure out how I want to handle it.

View(house)

I organized the description file so it’s easier to read. I should have just pasted it into Excel. I wasted a lot of time there.

This code lets me see where the NAs are:

empty_list <- rep(0, length(house[1,]) - 1)

NAcounts <- data.frame(empty_list, empty_list, empty_list)
colnames(NAcounts) <- c('Column', 'NA_Counts', 'NA_Ratio')

for (i in 1:length(house[1,])) {
  NAcounts[i,1] <- as.character(colnames(house))[i]
  NAcounts[i,2] <- sum(is.na(house[,i]))
  NAcounts[i,3] <- mean(is.na(house[,i]))
}

# View(NAcounts)
datatable(NAcounts)

Lets do this:

# HouseSummary <- cbind(NAcounts, empty_list)
# colnames(HouseSummary) <- c('Column', 'NA_Counts', 'NA_Ratio', 'Type')
# 
# house_t <- tibble(house)
# 
# for (i in 1:length(house[1,])) {
#   HouseSummary[i,4] <- typeof(house_t[,i])
# }

# View(HouseSummary)
# This didn't work for me.
str(house)

Well, lets just get to work.

# PoolQC is mostly NAs
table(house$PoolQC)

# MiscFeature is also a rare occurance
table(house$MiscFeature)

# This is good for now
table(house$MiscVal)

# I could label encode this
table(house$Alley)

# Label encode
table(house$Fence)

# Grab values
table(house$FireplaceQu) # this is better than using unique() because I can check the counts

# This is quantitative. "Linear feet of street connected to property"
# What does an NA mean? Probably just missing. There are enough NAs that I don't want to drop this column.
# I'm going to impute the median.
unique(house$LotFrontage)
lotfront_med <- median(drop_na(house, LotFrontage)$LotFrontage)

# I would like to just collapse it to a binary 'HasGarage'. But I want this to be quality data.
# Note: In every place that these are NA it means there is no garage.
table(house$GarageQual) # some are VERY short on observations
table(house$GarageCond) # some are VERY short on observations
table(house$GarageType) # some of these are real short on observations
table(house$GarageYrBlt) # Not sure how to deal with this one. I'll match the year built date.
table(house$GarageFinish) # just fine

# Will assume NAs are 'None' and 0, respectively
table(house$MasVnrType)
table(house$MasVnrArea)

# Hmm. And this has one NA.
table(house$Electrical)

# These are factors
table(house$MSSubClass)
palette(c("red","red","red","blue","blue","orange","orange","black","blue","blue","red","red","blue","orange","blue","blue"))
plot(log(SalePrice) ~ MSSubClass, data = house, col = factor(MSSubClass))

plot(log(SalePrice) ~ YearBuilt, data = house, col = factor(MSSubClass))

palette("default")
plot(log(SalePrice) ~ YearBuilt, data = house, col = factor(MSSubClass))

I won’t worry about this in my model

Cleaning

house2 <- house %>% 
  mutate(HasPool = !is.na(PoolQC)) %>%  
  select(-PoolQC) %>% 
  mutate(HasBonus = !is.na(MiscFeature)) %>% 
  select(-MiscFeature) %>% 
  mutate(Alley = ifelse(is.na(Alley), 0, ifelse(Alley == 'Grvl', 1, 2))) %>%
  mutate(Fence = ifelse(Fence == 'GdPrv', 4, ifelse(Fence == 'MnPrv', 3, ifelse(Fence == 'GdWo', 2, ifelse(Fence == 'MnWw', 1, 0))))) %>% 
  mutate(Fence = replace_na(Fence, 0)) %>% # ifelse didn't seem to like commands like is.na()
  mutate(FireplaceQu = case_when(FireplaceQu == 'Ex' ~ 5, FireplaceQu == 'Gd' ~ 4, FireplaceQu == 'TA' ~ 3, FireplaceQu == 'Fa' ~ 2, FireplaceQu == 'Po' ~ 1)) %>% 
  mutate(FireplaceQu = replace_na(FireplaceQu, 0)) %>% 
  mutate(LotFrontage = replace_na(LotFrontage, lotfront_med)) %>% # Can't use this one for Math 425
  mutate(GarageType = fct_explicit_na(GarageType, 'None')) %>% # Useful little function
  mutate(GarageFinish = case_when(GarageFinish == 'Fin' ~ 3, GarageFinish == 'Rfn' ~ 2, GarageFinish == 'Unf' ~ 1)) %>% 
  mutate(GarageFinish = replace_na(GarageFinish, 0)) %>% 
  mutate(GarageCond = case_when(GarageCond == 'Ex' | GarageCond == 'Gd' ~ 3, GarageCond == 'TA' ~ 2, GarageCond == 'Fa' | GarageCond == 'Po' ~ 1)) %>% 
  mutate(GarageCond = replace_na(GarageCond, 0)) %>% 
  mutate(GarageQual = case_when(GarageQual == 'Ex' | GarageQual == 'Gd' ~ 3, GarageQual == 'TA' ~ 2, GarageQual == 'Fa' | GarageQual == 'Po' ~ 1)) %>% 
  mutate(GarageQual = replace_na(GarageQual, 0)) %>% 
  mutate(MasVnrType = replace_na(MasVnrType, 'None')) %>%  # This works because 'None' is already a factor
  mutate(MasVnrArea = replace_na(MasVnrArea, 0)) %>% 
  select(-Electrical) # I'm not too worried.
  
# This method works, so why not?
house2$GarageYrBlt[is.na(house2$GarageYrBlt)] <- house2$YearBuilt[is.na(house2$GarageYrBlt)] # Having imputed those values, I can't use GarageYrBlt for a model in Math 425.

# At this rate I won't get it done, so unfortunately I have to move on to building a model.

#View(house2)

Exploring a little

# Just checking 
summary(lm(house$SalePrice ~ house$Id))
## 
## Call:
## lm(formula = house$SalePrice ~ house$Id)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -146990  -51260  -18034   32712  573920 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 183937.935   4160.774  44.208   <2e-16 ***
## house$Id        -4.130      4.934  -0.837    0.403    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79450 on 1458 degrees of freedom
## Multiple R-squared:  0.0004803,  Adjusted R-squared:  -0.0002052 
## F-statistic: 0.7007 on 1 and 1458 DF,  p-value: 0.4027
# Sample, because this takes too long
pairs(sample_n(cbind(house2$SalePrice,house2[,2:10]), 300))

tibble(house2)
## # A tibble: 1,460 x 1
##    house2$Id $MSSubClass $MSZoning $LotFrontage $LotArea $Street $Alley
##        <int>       <int> <fct>            <int>    <int> <fct>    <dbl>
##  1         1          60 RL                  65     8450 Pave         0
##  2         2          20 RL                  80     9600 Pave         0
##  3         3          60 RL                  68    11250 Pave         0
##  4         4          70 RL                  60     9550 Pave         0
##  5         5          60 RL                  84    14260 Pave         0
##  6         6          50 RL                  85    14115 Pave         0
##  7         7          20 RL                  75    10084 Pave         0
##  8         8          60 RL                  69    10382 Pave         0
##  9         9          50 RM                  51     6120 Pave         0
## 10        10         190 RL                  50     7420 Pave         0
## # ... with 1,450 more rows, and 73 more variables: $LotShape <fct>,
## #   $LandContour <fct>, $Utilities <fct>, $LotConfig <fct>,
## #   $LandSlope <fct>, $Neighborhood <fct>, $Condition1 <fct>,
## #   $Condition2 <fct>, $BldgType <fct>, $HouseStyle <fct>,
## #   $OverallQual <int>, $OverallCond <int>, $YearBuilt <int>,
## #   $YearRemodAdd <int>, $RoofStyle <fct>, $RoofMatl <fct>,
## #   $Exterior1st <fct>, $Exterior2nd <fct>, $MasVnrType <fct>,
## #   $MasVnrArea <dbl>, $ExterQual <fct>, $ExterCond <fct>,
## #   $Foundation <fct>, $BsmtQual <fct>, $BsmtCond <fct>,
## #   $BsmtExposure <fct>, $BsmtFinType1 <fct>, $BsmtFinSF1 <int>,
## #   $BsmtFinType2 <fct>, $BsmtFinSF2 <int>, $BsmtUnfSF <int>,
## #   $TotalBsmtSF <int>, $Heating <fct>, $HeatingQC <fct>,
## #   $CentralAir <fct>, $X1stFlrSF <int>, $X2ndFlrSF <int>,
## #   $LowQualFinSF <int>, $GrLivArea <int>, $BsmtFullBath <int>,
## #   $BsmtHalfBath <int>, $FullBath <int>, $HalfBath <int>,
## #   $BedroomAbvGr <int>, $KitchenAbvGr <int>, $KitchenQual <fct>,
## #   $TotRmsAbvGrd <int>, $Functional <fct>, $Fireplaces <int>,
## #   $FireplaceQu <dbl>, $GarageType <fct>, $GarageYrBlt <int>,
## #   $GarageFinish <dbl>, $GarageCars <int>, $GarageArea <int>,
## #   $GarageQual <dbl>, $GarageCond <dbl>, $PavedDrive <fct>,
## #   $WoodDeckSF <int>, $OpenPorchSF <int>, $EnclosedPorch <int>,
## #   $X3SsnPorch <int>, $ScreenPorch <int>, $PoolArea <int>, $Fence <dbl>,
## #   $MiscVal <int>, $MoSold <int>, $YrSold <int>, $SaleType <fct>,
## #   $SaleCondition <fct>, $SalePrice <int>, $HasPool <lgl>,
## #   $HasBonus <lgl>
#View(abs(cor(house))[,81])

plot(log(SalePrice) ~ Street, data = house2)# Street is worth keeping.

plot(log(SalePrice) ~ YearBuilt, col = factor(Alley), data = house2, pch = 16)

plot(log(SalePrice) ~ Alley, data = house2)

plot(log(SalePrice) ~ LotShape, data = house) # There's a trend, but I'm going to ignore it.

plot(log(SalePrice) ~ YearBuilt, data = house2, col = MSZoning)

house2 %>% 
  ggplot(aes(y = log(SalePrice), x = YearBuilt, col = MSZoning)) + 
  geom_point() +
  facet_wrap(house2$MSZoning)

# This also takes too long. I can't do this sort of thing with all of the columns

Pairs plots

colnames(house2)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "X1stFlrSF"     "X2ndFlrSF"    
## [45] "LowQualFinSF"  "GrLivArea"     "BsmtFullBath"  "BsmtHalfBath" 
## [49] "FullBath"      "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr" 
## [53] "KitchenQual"   "TotRmsAbvGrd"  "Functional"    "Fireplaces"   
## [57] "FireplaceQu"   "GarageType"    "GarageYrBlt"   "GarageFinish" 
## [61] "GarageCars"    "GarageArea"    "GarageQual"    "GarageCond"   
## [65] "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch"
## [69] "X3SsnPorch"    "ScreenPorch"   "PoolArea"      "Fence"        
## [73] "MiscVal"       "MoSold"        "YrSold"        "SaleType"     
## [77] "SaleCondition" "SalePrice"     "HasPool"       "HasBonus"
pairs(sample_n(cbind(log(house2$SalePrice),house2[,2:10]), 200))

# I'll take LotArea. #5
pairs(sample_n(cbind(log(house2$SalePrice),house2[,11:20]), 200))

# OverallQual, YearBuilt #18, 20
pairs(sample_n(cbind(log(house2$SalePrice),house2[,21:30]), 200))

# maybe YearRemodAdd, maybe MasVnrArea #21, 27
pairs(sample_n(cbind(log(house2$SalePrice),house2[,31:40]), 200))

# TotalBsmtSF #39
pairs(sample_n(cbind(log(house2$SalePrice),house2[,41:50]), 200))

# X1stFlrSF, X2ndFlrSF, GrLivArea #43, 44, 46
pairs(sample_n(cbind(log(house2$SalePrice),house2[,51:60]), 200))

# TotRmsAbvGrd #54
pairs(sample_n(cbind(log(house2$SalePrice),house2[,61:70]), 200))

# GarageArea #62
pairs(sample_n(cbind(log(house2$SalePrice),house2[,71:80]), 200))

# HasPool, HasBonus #79, 80 'table(house2$HasPool)': There are only 7 houses with a pool.

Code you’ll thank me for not running in this document:

summary(lm(log(SalePrice) ~ LotArea, data = house2))
# adj_r2 = 0.06557
summary(lm(log(SalePrice) ~ YearBuilt, data = house2))
# Adjusted R-squared:  0.3436
summary(lm(log(SalePrice) ~ LotArea + YearBuilt, data = house2))
# adj_r2 = 0.4053
summary(lm(log(SalePrice) ~ YearBuilt + LotArea:YearBuilt, data = house2))
# adj_r2 = 0.4054
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + LotArea:YearBuilt, data = house2))
# adj_r2 = 0.4056 #these hardly increase
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual, data = house2))
# Adjusted R-squared:  0.7208
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + YearBuilt:OverallQual, data = house2))
# Adjusted R-squared:  0.7207
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF, data = house2))
# Adjusted R-squared:  0.7425
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF, data = house2))
# Adjusted R-squared:  0.757
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF, data = house2))
# Adjusted R-squared:  0.8055
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + GrLivArea, data = house2))
# Adjusted R-squared:  0.8054  That didn't increase it and I think because this is a lot of the same information.
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF, data = house2))
# Adjusted R-squared:  0.8111  I might keep that.
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea, data = house2))
# Adjusted R-squared:   0.82
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea + YearRemodAdd, data = house2))
# Adjusted R-squared:  0.8276
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea + YearRemodAdd + MasVnrArea, data = house2))
# Adjusted R-squared:  0.8276 # No increase
summary(lm(log(SalePrice) ~ MasVnrArea, data = house2))

# let's pause to check some of this:

## Not using this:
# adj_r2s <- rep(0, 12)
# i <- 1
# 
# for (attribute in c(log(house2$SalePrice), house2$LotArea, house2$YearBuilt, house2$OverallQual, house2$TotalBsmtSF, house2$X1stFlrSF, house2$X2ndFlrSF, house2$GarageArea, house2$YearRemodAdd, house2$MasVnrArea, house2$HasPool, house2$HasBonus)) {
#   lm0 <- lm(log(house2$SalePrice) ~ attribute)
#   adj_r2s[i] <- 
#
# }
#View(cbind(Attribute = c("log(house2$SalePrice)", "house2$LotArea", "house2$YearBuilt", "house2$OverallQual", "house2$TotalBsmtSF", "house2$X1stFlrSF", "house2$X2ndFlrSF", "house2$GarageArea", "house2$YearRemodAdd", "house2$MasVnrArea", "house2$HasPool", "house2$HasBonus"), '|Correlation|' = abs(cor(cbind(log(house2$SalePrice), house2$LotArea, house2$YearBuilt, house2$OverallQual, house2$TotalBsmtSF, house2$X1stFlrSF, house2$X2ndFlrSF, house2$GarageArea, house2$YearRemodAdd, house2$MasVnrArea, house2$HasPool, house2$HasBonus)))[,1]))

house3 <- data.frame(LogPrice = log(house2$SalePrice), Lot = house2$LotArea, Year = house2$YearBuilt, Quality = house2$OverallQual, Basement = house2$TotalBsmtSF, X1stFlr = house2$X1stFlrSF, X2ndFlr = house2$X2ndFlrSF, Garage = house2$GarageArea, Remodel = house2$YearRemodAdd, Masonry = house2$MasVnrArea, Pool = house2$HasPool, Bonus = house2$HasBonus)

# pairs(house3)

house3 <- cbind(house3, MasonType = house2$MasVnrType) %>% 
  mutate(Stone = MasonType == 'Stone') %>% 
  mutate(BrkCmn = MasonType == 'BrkCmn') %>% 
  mutate(BrkFace = MasonType == 'BrkFace') %>% 
  mutate(MasonType = case_when(MasonType == 'None' ~ 0, MasonType == 'BrkCmn' ~ 1, MasonType == 'BrkFace' ~ 2, MasonType == 'Stone' ~ 3))

pairs(sample_n(house3, 250), col = factor(house3$MasonType))

I was looking for the wrong thing here.

Some more test models

summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry, data = house3)) # The Masonry significance levels aren't that high.

summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry + Pool + Bonus, data = house3))

summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry + Pool, data = house3))

summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)) # Best score of all of these

summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)) # Nope. It's better with Lot.

# A couple more things though:
summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool + Masonry:MasonType, data = house3))

summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + Garage + Remodel + Pool, data = house3))

This is my Final Model:

my_model <- lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)

summary(my_model)
## 
## Call:
## lm(formula = LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + 
##     X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98555 -0.07294  0.00885  0.08860  0.59656 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.712e+00  5.244e-01   5.171 2.65e-07 ***
## Lot              3.383e-06  4.637e-07   7.296 4.86e-13 ***
## Year             1.829e-03  2.054e-04   8.908  < 2e-16 ***
## Quality          9.672e-02  5.200e-03  18.602  < 2e-16 ***
## Basement         8.897e-05  1.870e-05   4.759 2.14e-06 ***
## X1stFlr          3.252e-04  2.292e-05  14.193  < 2e-16 ***
## X2ndFlr          4.198e-04  3.454e-05  12.155  < 2e-16 ***
## Garage           2.334e-04  2.693e-05   8.667  < 2e-16 ***
## Remodel          2.222e-03  2.760e-04   8.048 1.73e-15 ***
## PoolTRUE        -1.829e-01  6.627e-02  -2.759  0.00587 ** 
## X1stFlr:X2ndFlr -1.519e-07  2.564e-08  -5.925 3.90e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1655 on 1449 degrees of freedom
## Multiple R-squared:  0.8296, Adjusted R-squared:  0.8284 
## F-statistic: 705.3 on 10 and 1449 DF,  p-value: < 2.2e-16
plot(my_model, which = 1)

I have no way to interpret that plot.

How about this though?

lot_lm <- lm(LogPrice ~ Lot, data = house3)
plot(lot_lm, which = 1)

boxCox(lot_lm)

lot_lm2 <- lm(LogPrice ~ log(Lot), data = house3)
plot(lot_lm2, which = 1)

lot_lm3 <- lm(LogPrice ~ I(Lot^2), data = house3)
plot(lot_lm3, which = 1)

# I like Logging it better
summary(lot_lm2)
## 
## Call:
## lm(formula = LogPrice ~ log(Lot), data = house3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55029 -0.23028 -0.02586  0.24477  1.32863 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.21133    0.16910   54.47   <2e-16 ***
## log(Lot)     0.30872    0.01853   16.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3662 on 1458 degrees of freedom
## Multiple R-squared:  0.1599, Adjusted R-squared:  0.1594 
## F-statistic: 277.6 on 1 and 1458 DF,  p-value: < 2.2e-16
my_model2 <- lm(LogPrice ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)

summary(my_model2) # Yeah, that's more like it.
## 
## Call:
## lm(formula = LogPrice ~ log(Lot) + Year + Quality + Basement + 
##     X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + 
##     Pool, data = house3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85259 -0.07168  0.01186  0.08409  0.61679 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.172e+00  5.244e-01   2.234  0.02562 *  
## log(Lot)         1.206e-01  9.485e-03  12.711  < 2e-16 ***
## Year             1.992e-03  1.989e-04  10.015  < 2e-16 ***
## Quality          1.003e-01  5.032e-03  19.933  < 2e-16 ***
## Basement         9.104e-05  1.802e-05   5.052 4.92e-07 ***
## X1stFlr          2.744e-04  2.268e-05  12.100  < 2e-16 ***
## X2ndFlr          4.000e-04  3.341e-05  11.972  < 2e-16 ***
## Garage           1.871e-04  2.635e-05   7.099 1.96e-12 ***
## Remodel          2.332e-03  2.668e-04   8.740  < 2e-16 ***
## PoolTRUE        -1.770e-01  6.399e-02  -2.766  0.00575 ** 
## X1stFlr:X2ndFlr -1.499e-07  2.477e-08  -6.053 1.81e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1598 on 1449 degrees of freedom
## Multiple R-squared:  0.841,  Adjusted R-squared:  0.8399 
## F-statistic: 766.6 on 10 and 1449 DF,  p-value: < 2.2e-16

Much better

Validation:

# I realize I didn't explore with a sample of the data, but I can still test by creating the model from a smaller sample.
training_rows <- sample(1:length(house3$LogPrice), length(house3$LogPrice)*0.8)

training_data <- house3[training_rows,]
training_data$Price <- exp(training_data$LogPrice) # I had logged the prices, after all. This is y_hat_i

testing_data <- house3[-training_rows,]
testing_data$Price <- exp(testing_data$LogPrice)

final_model <- lm(log(Price) ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = training_data)

summary(final_model)
## 
## Call:
## lm(formula = log(Price) ~ log(Lot) + Year + Quality + Basement + 
##     X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + 
##     Pool, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.73879 -0.07425  0.01211  0.08452  0.69049 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.190e+00  6.019e-01   1.978 0.048186 *  
## log(Lot)         1.257e-01  1.097e-02  11.457  < 2e-16 ***
## Year             2.048e-03  2.280e-04   8.985  < 2e-16 ***
## Quality          1.024e-01  5.763e-03  17.774  < 2e-16 ***
## Basement         7.934e-05  2.057e-05   3.857 0.000121 ***
## X1stFlr          2.740e-04  2.597e-05  10.553  < 2e-16 ***
## X2ndFlr          4.127e-04  3.764e-05  10.964  < 2e-16 ***
## Garage           1.957e-04  2.973e-05   6.581 7.07e-11 ***
## Remodel          2.242e-03  3.053e-04   7.341 3.99e-13 ***
## PoolTRUE        -2.130e-01  7.922e-02  -2.689 0.007265 ** 
## X1stFlr:X2ndFlr -1.624e-07  2.764e-08  -5.876 5.47e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1631 on 1157 degrees of freedom
## Multiple R-squared:  0.839,  Adjusted R-squared:  0.8376 
## F-statistic: 603.1 on 10 and 1157 DF,  p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = testing_data)) # I must be doing something wrong here.

mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum(predictions - targets)^2 # Sum of squared residuals
SSTO <- sum(targets - mean_price)^2 # Total sum of squares

DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom

print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -2.471374e+30

Troubleshooting

> predict(final_model, data.frame(training_data[2,]))
    1343 
12.52126 
> exp(predict(final_model, data.frame(training_data[2,])))
    1343 
274103.6 
> training_data[2,]
     LogPrice  Lot Year Quality Basement X1stFlr X2ndFlr Garage Remodel Masonry  Pool Bonus MasonType Stone BrkCmn BrkFace

1343 12.33929 9375 2002 8 1284 1284 885 647 2002 149 FALSE FALSE 2 FALSE FALSE TRUE Price 1343 228500

> predict(final_model, data.frame(testing_data[2,]))
  11 
11.75375 
> exp(predict(final_model, data.frame(testing_data[2,])))
    11 
127230 
> testing_data[2,]
   LogPrice   Lot Year Quality Basement X1stFlr X2ndFlr Garage Remodel Masonry  Pool Bonus MasonType Stone BrkCmn BrkFace
11 11.77144 11200 1965       5     1040    1040       0    384    1965       0 FALSE FALSE         0 FALSE  FALSE   FALSE
    Price
11 129500

> sum(predictions - targets)^2
[1] 5.036271e+13
> sum((predictions - targets)^2)
[1] 1.448868e+13

I might have figured this out now:

mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares

DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom

print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766

That’s better, but still pretty bad. Maybe I can look at what the normal R-Squared is:

print("R-Squared:")
## [1] "R-Squared:"
(Adjusted_R_Squared <- 1 - SSE/SSTO)
## [1] -7.969565

Well, according to this, I’m really overfitting.

new_final_model <- lm(log(Price) ~ log(Lot) + Year + Quality + Basement + X1stFlr + Garage + X2ndFlr, data = training_data)

summary(new_final_model)
## 
## Call:
## lm(formula = log(Price) ~ log(Lot) + Year + Quality + Basement + 
##     X1stFlr + Garage + X2ndFlr, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17717 -0.07435  0.01223  0.08916  0.52071 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.078e+00  4.378e-01   9.316  < 2e-16 ***
## log(Lot)    1.231e-01  1.145e-02  10.751  < 2e-16 ***
## Year        2.856e-03  2.182e-04  13.094  < 2e-16 ***
## Quality     1.192e-01  5.783e-03  20.614  < 2e-16 ***
## Basement    5.568e-05  2.133e-05   2.610  0.00917 ** 
## X1stFlr     2.078e-04  2.482e-05   8.373  < 2e-16 ***
## Garage      2.027e-04  3.107e-05   6.523 1.03e-10 ***
## X2ndFlr     1.996e-04  1.381e-05  14.460  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1706 on 1160 degrees of freedom
## Multiple R-squared:  0.8234, Adjusted R-squared:  0.8224 
## F-statistic: 772.8 on 7 and 1160 DF,  p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = testing_data))

mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares

DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom

print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766
new_final_model <- lm(log(Price) ~ Year, data = training_data)

summary(new_final_model)
## 
## Call:
## lm(formula = log(Price) ~ Year, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24874 -0.19009 -0.03155  0.16857  1.67141 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.5983295  0.6227112  -5.778 9.66e-09 ***
## Year         0.0079271  0.0003158  25.100  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3262 on 1166 degrees of freedom
## Multiple R-squared:  0.3508, Adjusted R-squared:  0.3502 
## F-statistic:   630 on 1 and 1166 DF,  p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = data.frame(testing_data)))

mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares

DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom

print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766

I’m clearly doing something wrong with the predict function.

Plots

smaller_data <- sample_n(house3, 400)

plot(LogPrice ~ Year, data = smaller_data)
abline(lm(LogPrice ~ Year, data = house3))

plot(LogPrice ~ Quality, data = smaller_data)
abline(lm(LogPrice ~ Quality, data = house3))

plot(LogPrice ~ Basement, data = smaller_data)
abline(lm(LogPrice ~ Basement, data = house3))

plot(LogPrice ~ X1stFlr, data = smaller_data)
abline(lm(LogPrice ~ X1stFlr, data = house3))

plot(LogPrice ~ X2ndFlr, data = smaller_data)
abline(lm(LogPrice ~ X2ndFlr, data = house3))

The graph for the second floor shows that the line is tilted to account for all the single floor houses. It would be better if I had separated this in the model.

library(plotly) # Code for 3D plot below coppied from notebook
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
#Perform the multiple regression
three_d_lm <- lm(LogPrice ~ X1stFlr + X2ndFlr + X1stFlr:X2ndFlr, data = house3)

#Graph Resolution (more important for more complex shapes)
graph_reso <- 20

#Setup Axis
axis_x <- seq(min(smaller_data$X1stFlr), max(smaller_data$X1stFlr), by = graph_reso)
axis_y <- seq(min(smaller_data$X2ndFlr), max(smaller_data$X2ndFlr), by = graph_reso)

#Sample points
house_surface <- expand.grid(X1stFlr = axis_x, X2ndFlr = axis_y, KEEP.OUT.ATTRS = F)
house_surface$Z <- predict.lm(three_d_lm, newdata = house_surface)
house_surface <- acast(house_surface, X2ndFlr ~ X1stFlr, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(smaller_data, 
        x = ~X1stFlr, 
        y = ~X2ndFlr, 
        z = ~LogPrice,
        text = rownames(smaller_data), 
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = house_surface,
            x = axis_x,
            y = axis_y,
            type = "surface")
## Warning: 'surface' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'z', 'x', 'y', 'text', 'surfacecolor', 'cauto', 'cmin', 'cmax', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'contours', 'hidesurface', 'lightposition', 'lighting', '_deprecated', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'surfacecolorsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
plot(LogPrice ~ Garage, data = smaller_data)
abline(lm(LogPrice ~ Garage, data = house3))

plot(LogPrice ~ Remodel, data = smaller_data)
abline(lm(LogPrice ~ Remodel, data = house3))

palette(c("firebrick", "skyblue"))
plot(LogPrice ~ Year, data = house3, col = factor(house3$Pool), pch = 16, cex = 0.5, main = "Blue dots mean the house has a pool")
pool_lm <- lm(LogPrice ~ Year + Pool, data = house3)
abline(pool_lm$coef[1], pool_lm$coef[2], lwd = 2, col = "firebrick")
abline(pool_lm$coef[1] + pool_lm$coef[3], pool_lm$coef[2], lwd = 2, col = "skyblue")

# my_model2 <- lm(LogPrice ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)